We are examining the extent to which individuals’ subjective well-being can be predicted by objective factors that determine individual health decline and for which group this declaration bias is particularly large. And in that sense, which predictors (such as age, labor market status, personality traits and hand-grip strength) help improve predictions of individual health declines.
This report documents the steps taken to preprocess and analyze the SOEP data for health-related outcomes: subjective and objective health measures, predictors, and control variables. The transformations include cleaning, imputing missing values, and deriving new variables.
# Load required packages
my_packages <- c("tidyverse", "haven", "openxlsx", "labelled", "dplyr", "lubridate","ggplot2","skimr")
for (p in my_packages) {
if (!require(p, character.only = TRUE)) {
install.packages(p, dependencies = TRUE)
}
library(p, character.only = TRUE)
}## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: haven
##
## Loading required package: openxlsx
##
## Loading required package: labelled
##
## Loading required package: skimr
main <- read_dta('/Users/salmahoumane/Documents/R/pl.dta') %>%
select(
"pid", "syear", "ple0008", "plh0035", "ple0055", "ple0056", "ple0072", "ple0081_h", "ple0086_v4",
"ple0080_v2", "ple0086_v1", "ple0086_v4", paste0("ple00", 11:23),
"ple0004", "ple0005", "ple0095", "plh0182", "plh0171",
"plh0042", "plh0033", "plh0032", "ple0010_h", "ple0006", "ple0007"
)
cat("Main dataset dimensions:", dim(main), "\n")## Main dataset dimensions: 377869 35
# Merge with `pgen` dataset
data_pgen <- read_dta('/Users/salmahoumane/Documents/R/pgen.dta') %>%
select("pid", "syear", "pgerwzeit", "pgemplst", "pglabnet", "pgtatzeit", "pgexpue", "pgstib","pgfamstd")
cat("PGEN dataset dimensions:", dim(data_pgen), "\n")## PGEN dataset dimensions: 381814 9
main_before_merge <- dim(main) # Store dimensions before merge
main <- merge(main, data_pgen, by = c("pid", "syear"), all.x = TRUE)
cat("After merging with pgen: Rows =", nrow(main), "Cols =", ncol(main), "\n")## After merging with pgen: Rows = 377869 Cols = 42
## Same number of rows then the main dataset with 7 new rows -> Merge successful
# Check for duplicate rows introduced during merge
duplicates <- main %>%
group_by(pid, syear) %>%
summarise(count = n()) %>%
filter(count > 1)## `summarise()` has grouped output by 'pid'. You can override using the `.groups`
## argument.
if (nrow(duplicates) > 0) {
cat("Warning: Duplicates detected after merging with pgen!\n")
print(duplicates)
} else {
cat("No duplicates found after merging with pgen.\n")
}## No duplicates found after merging with pgen.
# Merge with `ppathl` dataset
data_ppathl <- read_dta('/Users/salmahoumane/Documents/R/ppathl.dta') %>%
select("pid", "hid", "syear", "psample", "gebjahr", "sex", "migback", "germborn")
cat("PPATHL dataset dimensions:", dim(data_ppathl), "\n")## PPATHL dataset dimensions: 634864 8
main_before_ppathl <- dim(main) # Store dimensions before second merge
main <- left_join(main, data_ppathl, by = c("pid", "syear"))
cat("After merging with ppathl: Rows =", nrow(main), "Cols =", ncol(main), "\n")## After merging with ppathl: Rows = 377869 Cols = 48
## Same number of rows then the main dataset with 6 new rows -> Merge successful
## Final dataset dimensions after all merges: 377869 48
# Clean and rename the `health` variable
main <- main %>%
rename(health = ple0008) %>%
filter(health > 0) %>% # Filter valid health values
mutate(health = 6 - health) # Reverse the scale: 1 -> 5, 5 -> 1
# Visualisation: Bar plot
ggplot(main, aes(x = factor(health))) +
geom_bar(fill = "steelblue") +
labs(
title = "Distribution of Self-Assessed Health",
x = "Health Category (1 = Poor, 5 = Excellent)",
y = "Count"
) +
theme_minimal()# Create `worr_health_categorical` and `worr_health_dummy`
# Calculate the number of missing values in `plh0035` (worries about health) for each year
missing_counts <- main %>%
group_by(syear) %>%
summarise(missing_count = sum(plh0035 < 0, na.rm = TRUE))
# Visualization
print(missing_counts)## # A tibble: 23 × 2
## syear missing_count
## <dbl+lbl> <int>
## 1 1999 14
## 2 2000 78
## 3 2001 85
## 4 2002 58
## 5 2003 45
## 6 2004 57
## 7 2005 49
## 8 2006 65
## 9 2007 57
## 10 2008 35
## # ℹ 13 more rows
## There are a lot of missing in years 2011, 2012 and 2013. Not to lose too many information, we will impute these values within each person with the mean of 2010 and 2014, other missings will be dropped.
main <- main %>%
rename(worr_health_categorical = plh0035) %>%
mutate(
worr_health_categorical = case_when(
worr_health_categorical < 0 ~ NA_real_,
TRUE ~ worr_health_categorical
)
) %>%
group_by(pid) %>%
mutate(
worr_health_categorical = if_else(
syear %in% c(2011, 2012, 2013) & is.na(worr_health_categorical),
floor(
mean(
worr_health_categorical[syear %in% c(2010, 2014)],
na.rm = TRUE
)
),
worr_health_categorical
)
) %>%
ungroup() %>%
drop_na(worr_health_categorical) %>%
mutate(
worr_health_categorical = factor(
worr_health_categorical, levels = c(3, 2, 1),
labels = c("no worries", "worries a little", "worries a lot")
),
worr_health_dummy = if_else(
worr_health_categorical %in% c("worries a lot", "worries a little"), 1, 0
)
) %>%
# Drop any rows with NA in worr_health_categorical
drop_na(worr_health_categorical)
# Visualisation: Bar plot (categorical)
ggplot(main, aes(x = worr_health_categorical)) +
geom_bar(fill = "coral") +
labs(
title = "Distribution of Worries About Health",
x = "Worries About Health",
y = "Count"
) +
theme_minimal()# Visualisation: Pie Char (dummy)
main %>%
group_by(worr_health_dummy) %>%
summarize(count = n()) %>%
mutate(
proportion = count / sum(count),
label = paste0(round(proportion * 100, 1), "%")
) %>%
ggplot(aes(x = "", y = proportion, fill = factor(worr_health_dummy))) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
labs(
title = "Proportion of Health Worry (Dummy)",
fill = "Health Worry (Dummy)"
) +
theme_minimal() +
geom_text(aes(label = label), position = position_stack(vjust = 0.5))# Calculate past health and health differences
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
past_health = lag(health, order_by = syear),
health_diff = health - past_health
) %>%
ungroup()
# Create health decline dummy for 2 years into the future
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
health_in_2yrs = lead(health, n = 2, order_by = syear),
health_decline_2yrs = case_when(
!is.na(health_in_2yrs) & health_in_2yrs < health ~ 1,
!is.na(health_in_2yrs) & health_in_2yrs >= health ~ 0,
TRUE ~ NA_real_
)
) %>%
ungroup()
#Visualisation: Line plot
# Line plot: Average health over years
main %>%
group_by(syear) %>%
summarize(avg_health = mean(health, na.rm = TRUE)) %>%
ggplot(aes(x = syear, y = avg_health)) +
geom_line(color = "darkgreen") +
geom_point(color = "darkgreen") +
labs(
title = "Average Self-Assessed Health Over Years",
x = "Survey Year",
y = "Average Health"
) +
theme_minimal()# Process grip strength data and merge with main
gripstr <- read_dta("/Users/salmahoumane/Documents/R/gripstr.dta") %>%
select(pid, syear, gs01, gs03, gs04, gs05, gs06) %>%
rename(
gripr1 = gs03, gripr2 = gs04,
gripl1 = gs05, gripl2 = gs06
) %>%
filter(gs01 %in% c(1, 2, 3)) %>%
mutate(across(starts_with("grip"), ~ ifelse(. < 0, NA_real_, .))) %>%
mutate(
avg_gripr = rowMeans(cbind(gripr1, gripr2), na.rm = TRUE),
avg_gripl = rowMeans(cbind(gripl1, gripl2), na.rm = TRUE),
avg_grip = case_when(
gs01 == 1 ~ avg_gripr,
gs01 == 2 ~ avg_gripl,
gs01 == 3 ~ rowMeans(cbind(avg_gripr, avg_gripl), na.rm = TRUE),
TRUE ~ NA_real_
)
)
main <- main %>%
left_join(gripstr, by = c("pid", "syear"))
# Calculate the number of missing values in grip strength columns for each year
missing_counts_gripstr <- main %>%
group_by(syear) %>%
summarise(missing_avg_grip = sum(is.na(avg_grip)))
# Display the results
print(missing_counts_gripstr)## # A tibble: 23 × 2
## syear missing_avg_grip
## <dbl+lbl> <int>
## 1 1999 6890
## 2 2000 12092
## 3 2001 10989
## 4 2002 11805
## 5 2003 11174
## 6 2004 10857
## 7 2005 10371
## 8 2006 8621
## 9 2007 10353
## 10 2008 7284
## # ℹ 13 more rows
# Visualisation: Histogram
ggplot(main, aes(x = avg_grip)) +
geom_histogram(binwidth = 20, fill = "steelblue", color = "black", alpha = 0.7) +
labs(
title = "Distribution of Average Grip Strength",
x = "Average Grip Strength",
y = "Count"
) +
theme_minimal()## Warning: Removed 264444 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Create `hospital_stay` variable
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
summarise(
ple0055_NA = sum(is.na(ple0055)),
ple0055_not_asked = sum(ple0055 %in% c(-8, -5), na.rm = TRUE),
ple0055_other_neg = sum(ple0055 < 0 & !(ple0055 %in% c(-8, -5)), na.rm = TRUE),
ple0056_NA = sum(is.na(ple0056)),
ple0056_not_asked = sum(ple0056 %in% c(-8, -5), na.rm = TRUE),
ple0056_other_neg = sum(ple0056 < 0 & !(ple0056 %in% c(-8, -5)), na.rm = TRUE)
)
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 6
## ple0055_NA ple0055_not_asked ple0055_other_neg ple0056_NA ple0056_not_asked
## <int> <int> <int> <int> <int>
## 1 0 16281 235480 0 16281
## # ℹ 1 more variable: ple0056_other_neg <int>
# Step 2: Impute Values for ple0055 and ple0056 (Capping Lag/Lead to 2 Years)
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
ple0055_flag = ifelse(ple0055 %in% c(-8, -5), 1, 0), # Flag imputations
ple0056_flag = ifelse(ple0056 %in% c(-8, -5), 1, 0),
ple0055 = case_when(
ple0055 %in% c(-8, -5) ~ {
lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0055), NA_real_)
lead_val <- if_else(lead(syear) - syear <= 2, lead(ple0055), NA_real_)
coalesce(lag_val, lead_val)
},
TRUE ~ ple0055
),
ple0056 = case_when(
ple0056 %in% c(-8, -5) ~ {
lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0056), NA_real_)
lead_val <- if_else(lead(syear) - syear <= 2, lead(ple0056), NA_real_)
coalesce(lag_val, lead_val)
},
TRUE ~ ple0056
),
imputed_flag = ifelse(ple0055_flag == 1 | ple0056_flag == 1, 1, 0) # Combined flag
) %>%
ungroup()
# Step 3: Filter Valid Rows and Calculate Hospital Stay
main <- main %>%
filter(!is.na(ple0055) & !is.na(ple0056)) %>%
mutate(hospital_stay = ple0055 + ple0056) %>%
filter(hospital_stay >= 0)
# Step 4: Imputed and Non-Imputed Statistics
# Imputed Rows
imputed_stats <- main %>%
filter(imputed_flag == 1) %>%
summarise(
Mean = mean(hospital_stay, na.rm = TRUE),
Median = median(hospital_stay, na.rm = TRUE),
Q1 = quantile(hospital_stay, 0.25, na.rm = TRUE),
Q3 = quantile(hospital_stay, 0.75, na.rm = TRUE),
Min = min(hospital_stay, na.rm = TRUE),
Max = max(hospital_stay, na.rm = TRUE),
Count = n()
)
# Non-Imputed Rows
non_imputed_stats <- main %>%
filter(imputed_flag == 0) %>%
summarise(
Mean = mean(hospital_stay, na.rm = TRUE),
Median = median(hospital_stay, na.rm = TRUE),
Q1 = quantile(hospital_stay, 0.25, na.rm = TRUE),
Q3 = quantile(hospital_stay, 0.75, na.rm = TRUE),
Min = min(hospital_stay, na.rm = TRUE),
Max = max(hospital_stay, na.rm = TRUE),
Count = n()
)
print("Descriptive Statistics for Imputed Rows:")## [1] "Descriptive Statistics for Imputed Rows:"
## # A tibble: 1 × 7
## Mean Median Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 8.73 6 4 9 0 110 249
## [1] "Descriptive Statistics for Non-Imputed Rows:"
## # A tibble: 1 × 7
## Mean Median Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 13.2 8 4 15 0 351 33900
# Step 5: Recheck Total Imputed Counts
imputation_counts <- main %>%
summarise(
Total_Imputed_Rows = sum(imputed_flag),
Total_Rows = n()
)
print("Imputation Counts:")## [1] "Imputation Counts:"
## # A tibble: 1 × 2
## Total_Imputed_Rows Total_Rows
## <dbl> <int>
## 1 249 34149
# Step 6: Visualizations
# All rows
ggplot(main, aes(x = hospital_stay)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "Hospital Stays - All Rows", x = "Days in Hospital", y = "Count")# Imputed rows
ggplot(main %>% filter(imputed_flag == 1), aes(x = hospital_stay)) +
geom_histogram(binwidth = 1, fill = "orange", color = "black") +
labs(title = "Hospital Stays - Imputed Rows", x = "Days in Hospital", y = "Count")# Non-Imputed rows
ggplot(main %>% filter(imputed_flag == 0), aes(x = hospital_stay)) +
geom_histogram(binwidth = 1, fill = "green", color = "black") +
labs(title = "Hospital Stays - Non-Imputed Rows", x = "Days in Hospital", y = "Count")# Create impaired_stairs and impaired_tiring_tasks
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
summarise(
ple0004_NA = sum(is.na(ple0004)),
ple0004_not_asked = sum(ple0004 %in% c(-5, -8), na.rm = TRUE),
ple0004_other_neg = sum(ple0004 < 0 & !(ple0004 %in% c(-5, -8)), na.rm = TRUE),
ple0005_NA = sum(is.na(ple0005)),
ple0005_not_asked = sum(ple0005 %in% c(-5, -8), na.rm = TRUE),
ple0005_other_neg = sum(ple0005 < 0 & !(ple0005 %in% c(-5, -8)), na.rm = TRUE)
)
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 6
## ple0004_NA ple0004_not_asked ple0004_other_neg ple0005_NA ple0005_not_asked
## <int> <int> <int> <int> <int>
## 1 0 19692 29 0 19692
## # ℹ 1 more variable: ple0005_other_neg <int>
# Step 2: Impute ple0004 and ple0005 with 2-Year Cap
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
ple0004_imputed = ifelse(ple0004 %in% c(-5, -8), 1, 0),
ple0005_imputed = ifelse(ple0005 %in% c(-5, -8), 1, 0),
imputed_flag = ifelse(ple0004_imputed == 1 | ple0005_imputed == 1, 1, 0),
ple0004 = case_when(
ple0004 < -2 ~ NA_real_,
ple0004 %in% c(-5, -8) ~ {
lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0004), NA_real_)
lead_val <- if_else(lead(syear) - syear <= 2, lead(ple0004), NA_real_)
coalesce(lag_val, lead_val)
},
TRUE ~ ple0004
),
ple0005 = case_when(
ple0005 < -2 ~ NA_real_,
ple0005 %in% c(-5, -8) ~ {
lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0005), NA_real_)
lead_val <- if_else(lead(syear) - syear <= 2, lead(ple0005), NA_real_)
coalesce(lag_val, lead_val)
},
TRUE ~ ple0005
)
) %>%
ungroup() %>%
mutate(
impaired_stairs = ifelse(ple0004 %in% c(1, 2), 1, 0),
impaired_tiring_tasks = ifelse(ple0005 %in% c(1, 2), 1, 0)
)
# Step 3: Track Missing Categories After Imputation
missing_breakdown_after <- main %>%
summarise(
ple0004_NA = sum(is.na(ple0004)),
ple0005_NA = sum(is.na(ple0005))
)
print("Breakdown of missing values after imputation:")## [1] "Breakdown of missing values after imputation:"
## # A tibble: 1 × 2
## ple0004_NA ple0005_NA
## <int> <int>
## 1 19692 19692
# Step 4: Descriptive Statistics
# Descriptive statistics for imputed rows
imputed_stats <- main %>%
filter(imputed_flag == 1) %>%
summarise(
Impaired_Stairs_Mean = mean(impaired_stairs, na.rm = TRUE),
Impaired_Stairs_Median = median(impaired_stairs, na.rm = TRUE),
Impaired_Stairs_Q1 = quantile(impaired_stairs, 0.25, na.rm = TRUE),
Impaired_Stairs_Q3 = quantile(impaired_stairs, 0.75, na.rm = TRUE),
Impaired_Tasks_Mean = mean(impaired_tiring_tasks, na.rm = TRUE),
Impaired_Tasks_Median = median(impaired_tiring_tasks, na.rm = TRUE),
Impaired_Tasks_Q1 = quantile(impaired_tiring_tasks, 0.25, na.rm = TRUE),
Impaired_Tasks_Q3 = quantile(impaired_tiring_tasks, 0.75, na.rm = TRUE),
Count = n()
)
# Descriptive statistics for non-imputed rows
non_imputed_stats <- main %>%
filter(imputed_flag == 0) %>%
summarise(
Impaired_Stairs_Mean = mean(impaired_stairs, na.rm = TRUE),
Impaired_Stairs_Median = median(impaired_stairs, na.rm = TRUE),
Impaired_Stairs_Q1 = quantile(impaired_stairs, 0.25, na.rm = TRUE),
Impaired_Stairs_Q3 = quantile(impaired_stairs, 0.75, na.rm = TRUE),
Impaired_Tasks_Mean = mean(impaired_tiring_tasks, na.rm = TRUE),
Impaired_Tasks_Median = median(impaired_tiring_tasks, na.rm = TRUE),
Impaired_Tasks_Q1 = quantile(impaired_tiring_tasks, 0.25, na.rm = TRUE),
Impaired_Tasks_Q3 = quantile(impaired_tiring_tasks, 0.75, na.rm = TRUE),
Count = n()
)
print("Descriptive Statistics for Imputed Rows:")## [1] "Descriptive Statistics for Imputed Rows:"
## # A tibble: 1 × 9
## Impaired_Stairs_Mean Impaired_Stairs_Median Impaired_Stairs_Q1
## <dbl> <dbl> <dbl>
## 1 0 0 0
## # ℹ 6 more variables: Impaired_Stairs_Q3 <dbl>, Impaired_Tasks_Mean <dbl>,
## # Impaired_Tasks_Median <dbl>, Impaired_Tasks_Q1 <dbl>,
## # Impaired_Tasks_Q3 <dbl>, Count <int>
## [1] "Descriptive Statistics for Non-Imputed Rows:"
## # A tibble: 1 × 9
## Impaired_Stairs_Mean Impaired_Stairs_Median Impaired_Stairs_Q1
## <dbl> <dbl> <dbl>
## 1 0.592 1 0
## # ℹ 6 more variables: Impaired_Stairs_Q3 <dbl>, Impaired_Tasks_Mean <dbl>,
## # Impaired_Tasks_Median <dbl>, Impaired_Tasks_Q1 <dbl>,
## # Impaired_Tasks_Q3 <dbl>, Count <int>
# Step 5: Visualizations
# Impaired Stairs
# All rows
ggplot(main, aes(x = factor(impaired_stairs))) +
geom_bar(fill = "skyblue", color = "black") +
labs(
title = "Impaired Stairs - All Rows",
x = "Impaired Stairs (1 = Yes, 0 = No)",
y = "Count"
) +
theme_minimal()# Imputed rows only
ggplot(main %>% filter(imputed_flag == 1), aes(x = factor(impaired_stairs))) +
geom_bar(fill = "orange", color = "black") +
labs(
title = "Impaired Stairs - Imputed Rows",
x = "Impaired Stairs (1 = Yes, 0 = No)",
y = "Count"
) +
theme_minimal()# Non-imputed rows only
ggplot(main %>% filter(imputed_flag == 0), aes(x = factor(impaired_stairs))) +
geom_bar(fill = "green", color = "black") +
labs(
title = "Impaired Stairs - Non-Imputed Rows",
x = "Impaired Stairs (1 = Yes, 0 = No)",
y = "Count"
) +
theme_minimal()# Impaired Tiring Tasks
# All rows
ggplot(main, aes(x = factor(impaired_tiring_tasks))) +
geom_bar(fill = "coral", color = "black") +
labs(
title = "Impaired Tiring Tasks - All Rows",
x = "Impaired Tasks (1 = Yes, 0 = No)",
y = "Count"
) +
theme_minimal()# Imputed rows only
ggplot(main %>% filter(imputed_flag == 1), aes(x = factor(impaired_tiring_tasks))) +
geom_bar(fill = "orange", color = "black") +
labs(
title = "Impaired Tiring Tasks - Imputed Rows",
x = "Impaired Tasks (1 = Yes, 0 = No)",
y = "Count"
) +
theme_minimal()# Non-imputed rows only
ggplot(main %>% filter(imputed_flag == 0), aes(x = factor(impaired_tiring_tasks))) +
geom_bar(fill = "green", color = "black") +
labs(
title = "Impaired Tiring Tasks - Non-Imputed Rows",
x = "Impaired Tasks (1 = Yes, 0 = No)",
y = "Count"
) +
theme_minimal()# Create illnesses variable
illness_vars <- c("ple0011", "ple0012", "ple0013", "ple0014", "ple0015",
"ple0016", "ple0017", "ple0018", "ple0019", "ple0020",
"ple0021", "ple0022", "ple0023")
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
summarise(across(
all_of(illness_vars),
list(
NA_count = function(x) sum(is.na(x)), # Count NAs
not_asked = function(x) sum(x %in% c(-8, -5), na.rm = TRUE), # Count -8, -5
other_neg = function(x) sum(x < 0 & !(x %in% c(-8, -5)), na.rm = TRUE) # Count other negatives
)
))
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 39
## ple0011_NA_count ple0011_not_asked ple0011_other_neg ple0012_NA_count
## <int> <int> <int> <int>
## 1 0 25068 7535 0
## # ℹ 35 more variables: ple0012_not_asked <int>, ple0012_other_neg <int>,
## # ple0013_NA_count <int>, ple0013_not_asked <int>, ple0013_other_neg <int>,
## # ple0014_NA_count <int>, ple0014_not_asked <int>, ple0014_other_neg <int>,
## # ple0015_NA_count <int>, ple0015_not_asked <int>, ple0015_other_neg <int>,
## # ple0016_NA_count <int>, ple0016_not_asked <int>, ple0016_other_neg <int>,
## # ple0017_NA_count <int>, ple0017_not_asked <int>, ple0017_other_neg <int>,
## # ple0018_NA_count <int>, ple0018_not_asked <int>, ple0018_other_neg <int>, …
# Step 2: Impute Illness Variables with Max 2-Year Lag/Lead
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(across(
all_of(illness_vars),
~ case_when(
. %in% c(-8, -5) ~ {
lag_val <- if_else(syear - lag(syear) <= 2, lag(., order_by = syear), NA_real_)
lead_val <- if_else(lead(syear) - syear <= 2, lead(., order_by = syear), NA_real_)
if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
},
. == -2 ~ 0, # Set -2 to 0
TRUE ~ . # Keep valid values
),
.names = "{.col}_imputed"
)) %>%
ungroup() %>%
mutate(across(all_of(illness_vars), ~ if_else(. < 0, NA_real_, .))) %>%
mutate(
illnesses = rowSums(across(all_of(illness_vars)), na.rm = TRUE),
imputed_flag = if_else(rowSums(across(ends_with("_imputed"), ~ . %in% c(-8, -5))) > 0, 1, 0)
) %>%
mutate(illnesses = round(illnesses))
# Step 3: Breakdown After Imputation
missing_breakdown_after <- main %>%
summarise(across(
all_of(illness_vars),
~ sum(is.na(.)), .names = "{.col}_NA_after"
))
print("Breakdown of missing values after imputation:")## [1] "Breakdown of missing values after imputation:"
## # A tibble: 1 × 13
## ple0011_NA_after ple0012_NA_after ple0013_NA_after ple0014_NA_after
## <int> <int> <int> <int>
## 1 32603 32704 33237 31912
## # ℹ 9 more variables: ple0015_NA_after <int>, ple0016_NA_after <int>,
## # ple0017_NA_after <int>, ple0018_NA_after <int>, ple0019_NA_after <int>,
## # ple0020_NA_after <int>, ple0021_NA_after <int>, ple0022_NA_after <int>,
## # ple0023_NA_after <int>
# Step 4: Descriptive Statistics for Imputed and Non-Imputed Rows
# Imputed Rows
imputed_stats <- main %>%
filter(imputed_flag == 1) %>%
summarise(
Mean_Illnesses = mean(illnesses, na.rm = TRUE),
Median_Illnesses = median(illnesses, na.rm = TRUE),
Q1 = quantile(illnesses, 0.25, na.rm = TRUE),
Q3 = quantile(illnesses, 0.75, na.rm = TRUE),
Min = min(illnesses, na.rm = TRUE),
Max = max(illnesses, na.rm = TRUE),
Count = n()
)
# Non-Imputed Rows
non_imputed_stats <- main %>%
filter(imputed_flag == 0) %>%
summarise(
Mean_Illnesses = mean(illnesses, na.rm = TRUE),
Median_Illnesses = median(illnesses, na.rm = TRUE),
Q1 = quantile(illnesses, 0.25, na.rm = TRUE),
Q3 = quantile(illnesses, 0.75, na.rm = TRUE),
Min = min(illnesses, na.rm = TRUE),
Max = max(illnesses, na.rm = TRUE),
Count = n()
)
print("Descriptive Statistics for Imputed Rows:")## [1] "Descriptive Statistics for Imputed Rows:"
## # A tibble: 1 × 7
## Mean_Illnesses Median_Illnesses Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 0.106 0 0 0 0 6 10190
## [1] "Descriptive Statistics for Non-Imputed Rows:"
## # A tibble: 1 × 7
## Mean_Illnesses Median_Illnesses Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 0.862 0 0 1 0 11 23959
# Step 5: Visualizations
# All Rows
ggplot(main, aes(x = illnesses)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(
title = "Distribution of Illness Counts - All Rows",
x = "Number of Illnesses",
y = "Count"
) +
theme_minimal()# Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 1), aes(x = illnesses)) +
geom_histogram(binwidth = 1, fill = "orange", color = "black") +
labs(
title = "Distribution of Illness Counts - Imputed Rows",
x = "Number of Illnesses",
y = "Count"
) +
theme_minimal()# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 0), aes(x = illnesses)) +
geom_histogram(binwidth = 1, fill = "green", color = "black") +
labs(
title = "Distribution of Illness Counts - Non-Imputed Rows",
x = "Number of Illnesses",
y = "Count"
) +
theme_minimal()# Create life_satisfaction and health_satisfaction variables
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
summarise(
plh0182_NA = sum(is.na(plh0182)),
plh0182_not_asked = sum(plh0182 %in% c(-8, -5), na.rm = TRUE),
plh0182_other_neg = sum(plh0182 < 0 & !(plh0182 %in% c(-8, -5)), na.rm = TRUE),
plh0171_NA = sum(is.na(plh0171)),
plh0171_not_asked = sum(plh0171 %in% c(-8, -5), na.rm = TRUE),
plh0171_other_neg = sum(plh0171 < 0 & !(plh0171 %in% c(-8, -5)), na.rm = TRUE)
)
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 6
## plh0182_NA plh0182_not_asked plh0182_other_neg plh0171_NA plh0171_not_asked
## <int> <int> <int> <int> <int>
## 1 0 296 74 0 394
## # ℹ 1 more variable: plh0171_other_neg <int>
# Step 2: Impute Values for plh0182 and plh0171 with Max 2-Year Lag/Lead
main <- main %>%
arrange(pid, syear) %>% # Ensure proper sorting
group_by(pid) %>%
mutate(
# Flag rows that are imputed
imputed_flag_life = if_else(plh0182 %in% c(-5, -8) | is.na(plh0182), 1, 0),
imputed_flag_health = if_else(plh0171 %in% c(-5, -8) | is.na(plh0171), 1, 0),
# Replace -5 and -8 with NA
plh0182 = case_when(plh0182 %in% c(-5, -8) ~ NA_real_, TRUE ~ plh0182),
plh0171 = case_when(plh0171 %in% c(-5, -8) ~ NA_real_, TRUE ~ plh0171),
# Impute plh0182 (life satisfaction)
life_satisfaction = if_else(
is.na(plh0182),
coalesce(
if_else(syear - lag(syear) <= 2 & !is.na(lag(plh0182)), lag(plh0182), NA_real_),
if_else(lead(syear) - syear <= 2 & !is.na(lead(plh0182)), lead(plh0182), NA_real_)
),
plh0182
),
# Impute plh0171 (health satisfaction)
health_satisfaction = if_else(
is.na(plh0171),
coalesce(
if_else(syear - lag(syear) <= 2 & !is.na(lag(plh0171)), lag(plh0171), NA_real_),
if_else(lead(syear) - syear <= 2 & !is.na(lead(plh0171)), lead(plh0171), NA_real_)
),
plh0171
)
) %>%
ungroup()
# Step 3: Breakdown After Imputation
missing_breakdown_after <- main %>%
summarise(
life_satisfaction_NA = sum(is.na(life_satisfaction)),
health_satisfaction_NA = sum(is.na(health_satisfaction))
)
print("Breakdown of missing values after imputation:")## [1] "Breakdown of missing values after imputation:"
## # A tibble: 1 × 2
## life_satisfaction_NA health_satisfaction_NA
## <int> <int>
## 1 185 185
# Step 4: Descriptive Statistics
# Imputed Rows
imputed_stats <- main %>%
filter(imputed_flag_life == 1 | imputed_flag_health == 1) %>%
summarise(
Life_Satisfaction_Mean = mean(life_satisfaction, na.rm = TRUE),
Life_Satisfaction_Median = median(life_satisfaction, na.rm = TRUE),
Health_Satisfaction_Mean = mean(health_satisfaction, na.rm = TRUE),
Health_Satisfaction_Median = median(health_satisfaction, na.rm = TRUE),
Count = n()
)
# Non-Imputed Rows
non_imputed_stats <- main %>%
filter(imputed_flag_life == 0 & imputed_flag_health == 0) %>%
summarise(
Life_Satisfaction_Mean = mean(life_satisfaction, na.rm = TRUE),
Life_Satisfaction_Median = median(life_satisfaction, na.rm = TRUE),
Health_Satisfaction_Mean = mean(health_satisfaction, na.rm = TRUE),
Health_Satisfaction_Median = median(health_satisfaction, na.rm = TRUE),
Count = n()
)
print("Descriptive Statistics for Imputed Rows:")## [1] "Descriptive Statistics for Imputed Rows:"
## # A tibble: 1 × 5
## Life_Satisfaction_Mean Life_Satisfaction_Median Health_Satisfaction_Mean
## <dbl> <dbl> <dbl>
## 1 7.20 8 6.08
## # ℹ 2 more variables: Health_Satisfaction_Median <dbl>, Count <int>
## [1] "Descriptive Statistics for Non-Imputed Rows:"
## # A tibble: 1 × 5
## Life_Satisfaction_Mean Life_Satisfaction_Median Health_Satisfaction_Mean
## <dbl> <dbl> <dbl>
## 1 6.77 7 5.55
## # ℹ 2 more variables: Health_Satisfaction_Median <dbl>, Count <int>
# Step 5: Visualizations
# All Rows: Life Satisfaction
ggplot(main, aes(x = factor(life_satisfaction))) +
geom_bar(fill = "skyblue", color = "black") +
labs(
title = "Life Satisfaction - All Rows",
x = "Life Satisfaction (0-10)",
y = "Count"
) +
theme_minimal()# Imputed Rows Only
ggplot(main %>% filter(imputed_flag_life == 1), aes(x = factor(life_satisfaction))) +
geom_bar(fill = "orange", color = "black") +
labs(
title = "Life Satisfaction - Imputed Rows",
x = "Life Satisfaction (0-10)",
y = "Count"
) +
theme_minimal()# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag_life == 0), aes(x = factor(life_satisfaction))) +
geom_bar(fill = "green", color = "black") +
labs(
title = "Life Satisfaction - Non-Imputed Rows",
x = "Life Satisfaction (0-10)",
y = "Count"
) +
theme_minimal()# All Rows: Health Satisfaction
ggplot(main, aes(x = factor(health_satisfaction))) +
geom_bar(fill = "coral", color = "black") +
labs(
title = "Health Satisfaction - All Rows",
x = "Health Satisfaction (0-10)",
y = "Count"
) +
theme_minimal()# Imputed Rows Only
ggplot(main %>% filter(imputed_flag_health == 1), aes(x = factor(health_satisfaction))) +
geom_bar(fill = "orange", color = "black") +
labs(
title = "Health Satisfaction - Imputed Rows",
x = "Health Satisfaction (0-10)",
y = "Count"
) +
theme_minimal()# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag_health == 0), aes(x = factor(health_satisfaction))) +
geom_bar(fill = "green", color = "black") +
labs(
title = "Health Satisfaction - Non-Imputed Rows",
x = "Health Satisfaction (0-10)",
y = "Count"
) +
theme_minimal()# Process height and weight
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
summarise(
height_NA = sum(is.na(ple0006)),
weight_NA = sum(is.na(ple0007)),
height_invalid = sum(ple0006 > 245 | ple0006 <= 0, na.rm = TRUE),
weight_invalid = sum(ple0007 <= 0, na.rm = TRUE)
)
print("Breakdown of missing and invalid values before imputation:")## [1] "Breakdown of missing and invalid values before imputation:"
## # A tibble: 1 × 4
## height_NA weight_NA height_invalid weight_invalid
## <int> <int> <int> <int>
## 1 0 0 19951 20062
# Step 2: Process and Impute Height with Lag/Lead (<= 2 years)
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
# Track imputation flag for height
height_imputed = if_else(ple0006 > 245 | ple0006 <= 0 | is.na(ple0006), 1, 0),
# Handle valid height and imputation
height = case_when(
ple0006 > 245 ~ NA_real_,
ple0006 > 0 ~ ple0006,
TRUE ~ NA_real_
),
height = case_when(
is.na(height) & syear - lag(syear) <= 2 ~ lag(height),
is.na(height) & lead(syear) - syear <= 2 ~ lead(height),
TRUE ~ height
)
) %>%
ungroup()
# Step 3: Process and Impute Weight with Lag/Lead (<= 2 years)
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
# Track imputation flag for weight
weight_imputed = if_else(ple0007 <= 0 | is.na(ple0007), 1, 0),
# Handle valid weight and imputation
weight = case_when(
ple0007 > 0 ~ ple0007,
TRUE ~ NA_real_
),
weight = case_when(
is.na(weight) & syear - lag(syear) <= 2 ~ lag(weight),
is.na(weight) & lead(syear) - syear <= 2 ~ lead(weight),
TRUE ~ weight
)
) %>%
ungroup()
# Step 4: Track Missing Categories After Imputation
missing_breakdown_after <- main %>%
summarise(
height_NA = sum(is.na(height)),
weight_NA = sum(is.na(weight)),
height_imputed = sum(height_imputed),
weight_imputed = sum(weight_imputed)
)
print("Breakdown of missing values after imputation:")## [1] "Breakdown of missing values after imputation:"
## # A tibble: 1 × 4
## height_NA weight_NA height_imputed weight_imputed
## <int> <int> <dbl> <dbl>
## 1 13743 13880 19951 20062
# Step 5: Calculate BMI and Create Categorical Variable
main <- main %>%
mutate(
BMI = weight / ((height / 100) ^ 2), # Calculate BMI
bmi_categorical = case_when(
BMI < 18.5 ~ "Underweight",
BMI >= 18.5 & BMI < 25 ~ "Normal weight",
BMI >= 25 & BMI < 30 ~ "Overweight",
BMI >= 30 ~ "Obese",
TRUE ~ NA_character_
),
bmi_categorical = factor(
bmi_categorical,
levels = c("Underweight", "Normal weight", "Overweight", "Obese")
)
)
# Step 6: Descriptive Statistics for Imputed and Non-Imputed Rows
# Descriptive stats for all rows
all_stats <- main %>%
summarise(
Mean = mean(BMI, na.rm = TRUE),
Median = median(BMI, na.rm = TRUE),
Q1 = quantile(BMI, 0.25, na.rm = TRUE),
Q3 = quantile(BMI, 0.75, na.rm = TRUE),
Min = min(BMI, na.rm = TRUE),
Max = max(BMI, na.rm = TRUE),
Count = n()
)
# Descriptive stats for imputed rows
imputed_stats <- main %>%
filter(height_imputed == 1 | weight_imputed == 1) %>%
summarise(
Mean = mean(BMI, na.rm = TRUE),
Median = median(BMI, na.rm = TRUE),
Q1 = quantile(BMI, 0.25, na.rm = TRUE),
Q3 = quantile(BMI, 0.75, na.rm = TRUE),
Min = min(BMI, na.rm = TRUE),
Max = max(BMI, na.rm = TRUE),
Count = n()
)
# Descriptive stats for non-imputed rows
non_imputed_stats <- main %>%
filter(height_imputed == 0 & weight_imputed == 0) %>%
summarise(
Mean = mean(BMI, na.rm = TRUE),
Median = median(BMI, na.rm = TRUE),
Q1 = quantile(BMI, 0.25, na.rm = TRUE),
Q3 = quantile(BMI, 0.75, na.rm = TRUE),
Min = min(BMI, na.rm = TRUE),
Max = max(BMI, na.rm = TRUE),
Count = n()
)
print("Descriptive Statistics for All Rows:")## [1] "Descriptive Statistics for All Rows:"
## # A tibble: 1 × 7
## Mean Median Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 26.9 26.2 23.3 29.7 11.0 95.2 34149
## [1] "Descriptive Statistics for Imputed Rows:"
## # A tibble: 1 × 7
## Mean Median Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 27.2 26.4 23.5 29.8 12.6 95.2 20070
## [1] "Descriptive Statistics for Non-Imputed Rows:"
## # A tibble: 1 × 7
## Mean Median Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 26.8 26.0 23.1 29.4 11.0 95.2 14079
# Step 7: Visualizations
# Histogram for all rows
ggplot(main, aes(x = BMI)) +
geom_histogram(binwidth = 1, fill = "purple", color = "black", alpha = 0.7) +
labs(
title = "BMI Distribution - All Rows",
x = "BMI",
y = "Count"
) +
theme_minimal()## Warning: Removed 13891 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Histogram for imputed rows
ggplot(main %>% filter(height_imputed == 1 | weight_imputed == 1), aes(x = BMI)) +
geom_histogram(binwidth = 1, fill = "orange", color = "black", alpha = 0.7) +
labs(
title = "BMI Distribution - Imputed Rows",
x = "BMI",
y = "Count"
) +
theme_minimal()## Warning: Removed 13891 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Histogram for non-imputed rows
ggplot(main %>% filter(height_imputed == 0 & weight_imputed == 0), aes(x = BMI)) +
geom_histogram(binwidth = 1, fill = "green", color = "black", alpha = 0.7) +
labs(
title = "BMI Distribution - Non-Imputed Rows",
x = "BMI",
y = "Count"
) +
theme_minimal()# Create `smoking_dummy`
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
summarise(
ple0081_h_NA = sum(is.na(ple0081_h)),
ple0081_h_invalid = sum(ple0081_h %in% c(-5, -8), na.rm = TRUE),
ple0086_v4_NA = sum(is.na(ple0086_v4)),
ple0086_v4_invalid = sum(ple0086_v4 %in% c(-5, -8), na.rm = TRUE)
)
print("Breakdown of missing and invalid values before imputation:")## [1] "Breakdown of missing and invalid values before imputation:"
## # A tibble: 1 × 4
## ple0081_h_NA ple0081_h_invalid ple0086_v4_NA ple0086_v4_invalid
## <int> <int> <int> <int>
## 1 0 19591 0 21578
# Step 2: Impute Smoking Status (ple0081_h) with Lag and Lead (Max 2 Years)
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
smoking_imputed = if_else(ple0081_h %in% c(-5, -8), 1, 0),
ple0081_h = case_when(
ple0081_h %in% c(-5, -8) ~ {
lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0081_h), NA_real_)
lead_val <- if_else(lead(syear) - syear <= 2, lead(ple0081_h), NA_real_)
if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
},
ple0081_h == -2 ~ 2, # Treat -2 as "no" (2)
ple0081_h > 0 ~ ple0081_h,
TRUE ~ NA_real_
),
smoking_dummy = if_else(ple0081_h == 1, 1, 0) # Create smoking dummy (1 = Smoker, 0 = Non-Smoker)
) %>%
ungroup()
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
summarise(
ple0086_v4_NA = sum(is.na(ple0086_v4)),
ple0086_v4_invalid = sum(ple0086_v4 %in% c(-5, -8), na.rm = TRUE),
ple0086_v4_negative = sum(ple0086_v4 < 0, na.rm = TRUE)
)
print("Breakdown of invalid values before imputation:")## [1] "Breakdown of invalid values before imputation:"
## # A tibble: 1 × 3
## ple0086_v4_NA ple0086_v4_invalid ple0086_v4_negative
## <int> <int> <int>
## 1 0 21578 34034
# Step 2: Impute Number of Cigarettes with Lag/Lead (Max 2 Years)
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
cigarettes_imputed = if_else(ple0086_v4 %in% c(-5, -8), 1, 0), # Flag imputed rows
ple0086_v4 = case_when(
ple0086_v4 %in% c(-5, -8) ~ {
# Impute using lag and lead within 2 years
lag_val <- if_else(syear - lag(syear) <= 2, lag(ple0086_v4), NA_real_)
lead_val <- if_else(lead(syear) - syear <= 2, lead(ple0086_v4), NA_real_)
valid_val <- if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
if_else(valid_val >= 0, valid_val, NA_real_) # Ensure valid positive values only
},
ple0086_v4 == -2 ~ 0, # Treat -2 as 0
ple0086_v4 >= 0 ~ ple0086_v4, # Keep valid positive values
TRUE ~ NA_real_ # Set other invalid cases to NA
),
nb_of_cigarettes = ple0086_v4
) %>%
ungroup()
# Step 3: Remove Remaining Invalid or Negative Values
main <- main %>%
mutate(nb_of_cigarettes = if_else(nb_of_cigarettes < 0, NA_real_, nb_of_cigarettes)) %>%
filter(!is.na(nb_of_cigarettes))
# Step 4: Track Missing Categories After Imputation
missing_breakdown_after <- main %>%
summarise(
ple0081_h_NA = sum(is.na(ple0081_h)),
ple0086_v4_NA = sum(is.na(ple0086_v4))
)
print("Breakdown of missing values after imputation:")## [1] "Breakdown of missing values after imputation:"
## # A tibble: 1 × 2
## ple0081_h_NA ple0086_v4_NA
## <int> <int>
## 1 9 0
# Step 5: Descriptive Statistics
# Smoking Dummy
smoking_stats_all <- main %>%
summarise(
Mean_Smoking = mean(smoking_dummy, na.rm = TRUE),
Count = n()
)
smoking_stats_imputed <- main %>%
filter(smoking_imputed == 1) %>%
summarise(
Mean_Smoking = mean(smoking_dummy, na.rm = TRUE),
Count = n()
)
smoking_stats_non_imputed <- main %>%
filter(smoking_imputed == 0) %>%
summarise(
Mean_Smoking = mean(smoking_dummy, na.rm = TRUE),
Count = n()
)
print("Descriptive Statistics for Smoking (All Rows):")## [1] "Descriptive Statistics for Smoking (All Rows):"
## # A tibble: 1 × 2
## Mean_Smoking Count
## <dbl> <int>
## 1 0.257 12633
## [1] "Descriptive Statistics for Smoking (Imputed Rows):"
## # A tibble: 1 × 2
## Mean_Smoking Count
## <dbl> <int>
## 1 0.887 62
## [1] "Descriptive Statistics for Smoking (Non-Imputed Rows):"
## # A tibble: 1 × 2
## Mean_Smoking Count
## <dbl> <int>
## 1 0.254 12571
# Number of Cigarettes
cigarettes_stats_all <- main %>%
summarise(
Mean_Cigarettes = mean(nb_of_cigarettes, na.rm = TRUE),
Median = median(nb_of_cigarettes, na.rm = TRUE),
Q1 = quantile(nb_of_cigarettes, 0.25, na.rm = TRUE),
Q3 = quantile(nb_of_cigarettes, 0.75, na.rm = TRUE),
Min = min(nb_of_cigarettes, na.rm = TRUE),
Max = max(nb_of_cigarettes, na.rm = TRUE),
Count = n()
)
cigarettes_stats_imputed <- main %>%
filter(cigarettes_imputed == 1) %>%
summarise(
Mean_Cigarettes = mean(nb_of_cigarettes, na.rm = TRUE),
Median = median(nb_of_cigarettes, na.rm = TRUE),
Q1 = quantile(nb_of_cigarettes, 0.25, na.rm = TRUE),
Q3 = quantile(nb_of_cigarettes, 0.75, na.rm = TRUE),
Min = min(nb_of_cigarettes, na.rm = TRUE),
Max = max(nb_of_cigarettes, na.rm = TRUE),
Count = n()
)
cigarettes_stats_non_imputed <- main %>%
filter(cigarettes_imputed == 0) %>%
summarise(
Mean_Cigarettes = mean(nb_of_cigarettes, na.rm = TRUE),
Median = median(nb_of_cigarettes, na.rm = TRUE),
Q1 = quantile(nb_of_cigarettes, 0.25, na.rm = TRUE),
Q3 = quantile(nb_of_cigarettes, 0.75, na.rm = TRUE),
Min = min(nb_of_cigarettes, na.rm = TRUE),
Max = max(nb_of_cigarettes, na.rm = TRUE),
Count = n()
)
print("Descriptive Statistics for Number of Cigarettes (All Rows):")## [1] "Descriptive Statistics for Number of Cigarettes (All Rows):"
## # A tibble: 1 × 7
## Mean_Cigarettes Median Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl+lbl> <dbl+lbl> <int>
## 1 0.106 0 0 0 0 30 12633
## [1] "Descriptive Statistics for Number of Cigarettes (Imputed Rows):"
## # A tibble: 1 × 7
## Mean_Cigarettes Median Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl+lbl> <dbl+lbl> <int>
## 1 7.30 5 3 10 0 30 64
## [1] "Descriptive Statistics for Number of Cigarettes (Non-Imputed Rows):"
## # A tibble: 1 × 7
## Mean_Cigarettes Median Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl+lbl> <dbl+lbl> <int>
## 1 0.0695 0 0 0 0 30 12569
# Step 6: Visualizations
# Smoking Dummy
ggplot(main, aes(x = factor(smoking_dummy))) +
geom_bar(fill = "skyblue", color = "black") +
labs(title = "Smoking Status - All Rows", x = "Smoking Status (1 = Yes, 0 = No)", y = "Count") +
theme_minimal()ggplot(main %>% filter(smoking_imputed == 1), aes(x = factor(smoking_dummy))) +
geom_bar(fill = "orange", color = "black") +
labs(title = "Smoking Status - Imputed Rows", x = "Smoking Status (1 = Yes, 0 = No)", y = "Count") +
theme_minimal()ggplot(main %>% filter(smoking_imputed == 0), aes(x = factor(smoking_dummy))) +
geom_bar(fill = "green", color = "black") +
labs(title = "Smoking Status - Non-Imputed Rows", x = "Smoking Status (1 = Yes, 0 = No)", y = "Count") +
theme_minimal()# Number of Cigarettes
ggplot(main, aes(x = nb_of_cigarettes)) +
geom_histogram(binwidth = 1, fill = "purple", color = "black") +
labs(title = "Number of Cigarettes - All Rows", x = "Cigarettes Per Day", y = "Count") +
theme_minimal()ggplot(main %>% filter(cigarettes_imputed == 1), aes(x = nb_of_cigarettes)) +
geom_histogram(binwidth = 1, fill = "orange", color = "black") +
labs(title = "Number of Cigarettes - Imputed Rows", x = "Cigarettes Per Day", y = "Count") +
theme_minimal()ggplot(main %>% filter(cigarettes_imputed == 0), aes(x = nb_of_cigarettes)) +
geom_histogram(binwidth = 1, fill = "green", color = "black") +
labs(title = "Number of Cigarettes - Non-Imputed Rows", x = "Cigarettes Per Day", y = "Count") +
theme_minimal()# Create diet_healthy and diet_dummy
main <- main %>%
mutate(
ple0095 = ifelse(ple0095 < 0, NA_real_, ple0095),
diet_categorical = factor(
ple0095, levels = c(1, 2, 3, 4),
labels = c("very healthy", "somewhat healthy", "somewhat unhealthy", "very unhealthy")
),
diet_dummy = case_when(
ple0095 %in% c(1, 2) ~ 1,
ple0095 %in% c(3, 4) ~ 0,
TRUE ~ NA_real_
)
) %>%
select(-ple0095)
#Visusalition: Bar Plot
ggplot(main, aes(x = diet_categorical)) +
geom_bar(fill = "darkgreen", color = "black") +
labs(
title = "Distribution of Diet Categories",
x = "Diet Category",
y = "Count"
) +
theme_minimal()# Create variables for worries about job security, financial situation, and economic situation
# Step 1: Track Missing Categories Before Imputation
missing_breakdown_before <- main %>%
summarise(
plh0042_NA = sum(is.na(plh0042)),
plh0042_invalid = sum(plh0042 %in% c(-5, -8), na.rm = TRUE),
plh0033_NA = sum(is.na(plh0033)),
plh0033_invalid = sum(plh0033 %in% c(-5, -8), na.rm = TRUE),
plh0032_NA = sum(is.na(plh0032)),
plh0032_invalid = sum(plh0032 %in% c(-5, -8), na.rm = TRUE)
)
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 6
## plh0042_NA plh0042_invalid plh0033_NA plh0033_invalid plh0032_NA
## <int> <int> <int> <int> <int>
## 1 0 0 0 0 0
## # ℹ 1 more variable: plh0032_invalid <int>
# Step 2: Lag/Lead Imputation (Max 2 Years)
main <- main %>%
arrange(pid, syear) %>% # Ensure data is ordered by pid and syear
group_by(pid) %>%
mutate(
# Impute -5 or -8 for `plh0042` with max 2-year lag/lead
worries_job_imputed = if_else(plh0042 %in% c(-5, -8), 1, 0),
plh0042 = case_when(
plh0042 %in% c(-5, -8) ~ {
lag_val <- if_else(syear - lag(syear) <= 2, lag(plh0042), NA_real_)
lead_val <- if_else(lead(syear) - syear <= 2, lead(plh0042), NA_real_)
if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
},
TRUE ~ plh0042
),
# Repeat for `plh0033`
worries_financial_imputed = if_else(plh0033 %in% c(-5, -8), 1, 0),
plh0033 = case_when(
plh0033 %in% c(-5, -8) ~ {
lag_val <- if_else(syear - lag(syear) <= 2, lag(plh0033), NA_real_)
lead_val <- if_else(lead(syear) - syear <= 2, lead(plh0033), NA_real_)
if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
},
TRUE ~ plh0033
),
# Repeat for `plh0032`
worries_economic_imputed = if_else(plh0032 %in% c(-5, -8), 1, 0),
plh0032 = case_when(
plh0032 %in% c(-5, -8) ~ {
lag_val <- if_else(syear - lag(syear) <= 2, lag(plh0032), NA_real_)
lead_val <- if_else(lead(syear) - syear <= 2, lead(plh0032), NA_real_)
if_else(!is.na(lag_val) & !is.na(lead_val), (lag_val + lead_val) / 2, coalesce(lag_val, lead_val))
},
TRUE ~ plh0032
)
) %>%
ungroup()
# Step 3: Create Categorical and Dummy Variables
main <- main %>%
mutate(
worr_job_categorical = factor(case_when(plh0042 == -2 ~ 3, TRUE ~ plh0042),
levels = c(3, 2, 1), labels = c("no worries", "worries a little", "worries a lot")),
worr_financial_categorical = factor(case_when(plh0033 == -2 ~ 3, TRUE ~ plh0033),
levels = c(3, 2, 1), labels = c("no worries", "worries a little", "worries a lot")),
worr_economic_categorical = factor(case_when(plh0032 == -2 ~ 3, TRUE ~ plh0032),
levels = c(3, 2, 1), labels = c("no worries", "worries a little", "worries a lot")),
worr_job_dummy = ifelse(worr_job_categorical %in% c("worries a lot", "worries a little"), 1, 0),
worr_financial_dummy = ifelse(worr_financial_categorical %in% c("worries a lot", "worries a little"), 1, 0),
worr_economic_dummy = ifelse(worr_economic_categorical %in% c("worries a lot", "worries a little"), 1, 0)
)# Define descriptive_stats function
descriptive_stats <- function(data, condition) {
data %>%
filter(!!rlang::enquo(condition)) %>% # Use enquo for capturing condition
summarise(
Job_Worries_Mean = mean(as.numeric(worr_job_dummy), na.rm = TRUE),
Financial_Worries_Mean = mean(as.numeric(worr_financial_dummy), na.rm = TRUE),
Economic_Worries_Mean = mean(as.numeric(worr_economic_dummy), na.rm = TRUE),
Count = n()
)
}
# Calculate statistics for all rows, imputed rows, and non-imputed rows
all_stats <- descriptive_stats(main, TRUE)
imputed_stats <- descriptive_stats(main, worries_job_imputed == 1 | worries_financial_imputed == 1 | worries_economic_imputed == 1)
non_imputed_stats <- descriptive_stats(main, worries_job_imputed == 0 & worries_financial_imputed == 0 & worries_economic_imputed == 0)
print("Descriptive Statistics for All Rows:")## [1] "Descriptive Statistics for All Rows:"
## # A tibble: 1 × 4
## Job_Worries_Mean Financial_Worries_Mean Economic_Worries_Mean Count
## <dbl> <dbl> <dbl> <int>
## 1 0.188 0.683 0.835 12633
## [1] "Descriptive Statistics for Imputed Rows:"
## # A tibble: 1 × 4
## Job_Worries_Mean Financial_Worries_Mean Economic_Worries_Mean Count
## <dbl> <dbl> <dbl> <int>
## 1 NaN NaN NaN 0
## [1] "Descriptive Statistics for Non-Imputed Rows:"
## # A tibble: 1 × 4
## Job_Worries_Mean Financial_Worries_Mean Economic_Worries_Mean Count
## <dbl> <dbl> <dbl> <int>
## 1 0.188 0.683 0.835 12633
# Step 5: Visualizations
# Drop rows with NAs in worry-related columns
main <- main %>%
drop_na(worr_job_categorical, worr_financial_categorical, worr_economic_categorical)
# Distribution of Worry Levels - All Rows
main %>%
select(worr_job_categorical, worr_financial_categorical, worr_economic_categorical) %>%
pivot_longer(cols = everything(), names_to = "Worry_Type", values_to = "Worry_Level") %>%
ggplot(aes(x = Worry_Level, fill = Worry_Type)) +
geom_bar(position = "dodge", color = "black") +
labs(
title = "Distribution of Worry Levels - All Rows",
x = "Worry Level",
y = "Count",
fill = "Worry Type"
) +
theme_minimal()# Imputed Rows
main %>%
filter(worries_job_imputed == 1 | worries_financial_imputed == 1 | worries_economic_imputed == 1) %>%
pivot_longer(cols = c(worr_job_categorical, worr_financial_categorical, worr_economic_categorical),
names_to = "Worry_Type", values_to = "Worry_Level") %>%
ggplot(aes(x = Worry_Level, fill = Worry_Type)) +
geom_bar(position = "dodge", color = "black") +
labs(
title = "Distribution of Worry Levels - Imputed Rows",
x = "Worry Level",
y = "Count",
fill = "Worry Type"
) +
theme_minimal()# Non-Imputed Rows
main %>%
filter(worries_job_imputed == 0 & worries_financial_imputed == 0 & worries_economic_imputed == 0) %>%
pivot_longer(cols = c(worr_job_categorical, worr_financial_categorical, worr_economic_categorical),
names_to = "Worry_Type", values_to = "Worry_Level") %>%
ggplot(aes(x = Worry_Level, fill = Worry_Type)) +
geom_bar(position = "dodge", color = "black") +
labs(
title = "Distribution of Worry Levels - Non-Imputed Rows",
x = "Worry Level",
y = "Count",
fill = "Worry Type"
) +
theme_minimal()# Step 1: Breakdown Before Imputation
missing_breakdown_before <- main %>%
summarise(
NA_count = sum(is.na(pgerwzeit)),
not_asked = sum(pgerwzeit %in% c(-8, -5), na.rm = TRUE),
other_neg = sum(pgerwzeit < 0 & !(pgerwzeit %in% c(-8, -5)), na.rm = TRUE)
)
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 3
## NA_count not_asked other_neg
## <int> <int> <int>
## 1 0 0 7503
# Step 2: Impute Tenure with Year Difference Logic and Max 2-Year Lag/Lead
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
tenure = ifelse(pgerwzeit %in% c(-5, -8), NA_real_, pgerwzeit),
tenure = {
temp <- tenure
years <- syear
for (i in seq_len(n())) {
if (is.na(temp[i])) {
lag_idx <- which(!is.na(temp) & years < years[i] & (years[i] - years <= 2))
lead_idx <- which(!is.na(temp) & years > years[i] & (years - years[i] <= 2))
if (length(lag_idx) > 0 && length(lead_idx) > 0) {
lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
lead_diff <- years[lead_idx[1]] - years[i]
nearest <- ifelse(lag_diff <= lead_diff, lag_idx[length(lag_idx)], lead_idx[1])
temp[i] <- temp[nearest] + ifelse(nearest %in% lag_idx, lag_diff, -lead_diff)
} else if (length(lag_idx) > 0) {
lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
temp[i] <- temp[lag_idx[length(lag_idx)]] + lag_diff
} else if (length(lead_idx) > 0) {
lead_diff <- years[lead_idx[1]] - years[i]
temp[i] <- temp[lead_idx[1]] - lead_diff
}
}
}
temp
},
tenure = ifelse(is.na(tenure) & pgerwzeit == -2, 0, tenure)
) %>%
ungroup() %>%
mutate(
tenure = ifelse(tenure < 0, NA_real_, tenure), # Remove any remaining negatives
imputed_flag = if_else(is.na(pgerwzeit) | pgerwzeit %in% c(-8, -5), 1, 0) # Track imputed rows
)
# Step 3: Breakdown After Imputation
missing_breakdown_after <- main %>%
summarise(
NA_count_after = sum(is.na(tenure)),
imputed_count = sum(imputed_flag)
)
print("Breakdown of missing values after imputation:")## [1] "Breakdown of missing values after imputation:"
## # A tibble: 1 × 2
## NA_count_after imputed_count
## <int> <dbl>
## 1 7503 0
# Step 4: Descriptive Statistics for Imputed and Non-Imputed Rows
# Imputed Rows
imputed_stats <- main %>%
filter(imputed_flag == 1) %>%
summarise(
Mean_Tenure = mean(tenure, na.rm = TRUE),
Median_Tenure = median(tenure, na.rm = TRUE),
Q1 = quantile(tenure, 0.25, na.rm = TRUE),
Q3 = quantile(tenure, 0.75, na.rm = TRUE),
Min = min(tenure, na.rm = TRUE),
Max = max(tenure, na.rm = TRUE),
Count = n()
)## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Min = min(tenure, na.rm = TRUE)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Non-Imputed Rows
non_imputed_stats <- main %>%
filter(imputed_flag == 0) %>%
summarise(
Mean_Tenure = mean(tenure, na.rm = TRUE),
Median_Tenure = median(tenure, na.rm = TRUE),
Q1 = quantile(tenure, 0.25, na.rm = TRUE),
Q3 = quantile(tenure, 0.75, na.rm = TRUE),
Min = min(tenure, na.rm = TRUE),
Max = max(tenure, na.rm = TRUE),
Count = n()
)
print("Descriptive Statistics for Imputed Rows:")## [1] "Descriptive Statistics for Imputed Rows:"
## # A tibble: 1 × 7
## Mean_Tenure Median_Tenure Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 NaN NA NA NA Inf -Inf 0
## [1] "Descriptive Statistics for Non-Imputed Rows:"
## # A tibble: 1 × 7
## Mean_Tenure Median_Tenure Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 11.6 7.92 2.42 18.5 0 62.1 12370
# Step 5: Visualizations
# All Rows
ggplot(main, aes(x = tenure)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(
title = "Distribution of Tenure - All Rows",
x = "Tenure (Years)",
y = "Count"
) +
theme_minimal()## Warning: Removed 7503 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 1), aes(x = tenure)) +
geom_histogram(binwidth = 1, fill = "orange", color = "black") +
labs(
title = "Distribution of Tenure - Imputed Rows",
x = "Tenure (Years)",
y = "Count"
) +
theme_minimal()# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 0), aes(x = tenure)) +
geom_histogram(binwidth = 1, fill = "green", color = "black") +
labs(
title = "Distribution of Tenure - Non-Imputed Rows",
x = "Tenure (Years)",
y = "Count"
) +
theme_minimal()## Warning: Removed 7503 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Transform `pglabnet` into `net_income`
# Transform `pglabnet` into `net_income`
main <- main %>%
arrange(pid, syear) %>% # Ensure data is ordered by pid and syear
group_by(pid) %>%
mutate(
# Replace invalid values with NA
net_income = ifelse(pglabnet %in% c(-5, -8, 99999), NA_real_, pglabnet),
# Iterative imputation using lag and lead with a max 2-year difference
net_income = {
temp <- net_income
years <- syear
for (i in seq_len(n())) {
if (is.na(temp[i])) {
# Find valid lag and lead indices within a 2-year difference
lag_idx <- which(!is.na(temp) & years < years[i] & (years[i] - years <= 2))
lead_idx <- which(!is.na(temp) & years > years[i] & (years - years[i] <= 2))
if (length(lag_idx) > 0 && length(lead_idx) > 0) {
lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
lead_diff <- years[lead_idx[1]] - years[i]
nearest <- ifelse(lag_diff <= lead_diff, lag_idx[length(lag_idx)], lead_idx[1])
temp_value <- temp[nearest]
temp[i] <- ifelse(temp_value < 0, NA_real_, temp_value) # Set NA if negative
} else if (length(lag_idx) > 0) {
temp_value <- temp[lag_idx[length(lag_idx)]]
temp[i] <- ifelse(temp_value < 0, NA_real_, temp_value) # Set NA if negative
} else if (length(lead_idx) > 0) {
temp_value <- temp[lead_idx[1]]
temp[i] <- ifelse(temp_value < 0, NA_real_, temp_value) # Set NA if negative
}
}
}
temp
},
# Set -2 to NA
net_income = ifelse(net_income < 0, NA_real_, net_income) # Ensure no negative values
) %>%
ungroup() %>%
mutate(
# Remove remaining NA values in ALL rows
net_income = ifelse(is.na(net_income), NA_real_, net_income),
imputed_flag_net_income = ifelse(is.na(pglabnet) | pglabnet %in% c(-5, -8, 99999), 1, 0)
) %>%
filter(!is.na(net_income)) # Remove rows with NA net_income
# Transform `pgtatzeit` into `work_time_weekly`
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
# Replace invalid values with NA
work_time_weekly = ifelse(pgtatzeit %in% c(-5, -8), NA_real_, pgtatzeit),
# Iterative imputation using lag and lead with a max 2-year difference
work_time_weekly = {
temp <- work_time_weekly
years <- syear
for (i in seq_len(n())) {
if (is.na(temp[i])) {
# Find valid lag and lead indices within a 2-year difference
lag_idx <- which(!is.na(temp) & years < years[i] & (years[i] - years <= 2))
lead_idx <- which(!is.na(temp) & years > years[i] & (years - years[i] <= 2))
if (length(lag_idx) > 0 && length(lead_idx) > 0) {
lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
lead_diff <- years[lead_idx[1]] - years[i]
nearest <- ifelse(lag_diff <= lead_diff, lag_idx[length(lag_idx)], lead_idx[1])
temp_value <- temp[nearest]
temp[i] <- ifelse(temp_value < 0, NA_real_, temp_value) # Set NA if negative
} else if (length(lag_idx) > 0) {
temp_value <- temp[lag_idx[length(lag_idx)]]
temp[i] <- ifelse(temp_value < 0, NA_real_, temp_value) # Set NA if negative
} else if (length(lead_idx) > 0) {
temp_value <- temp[lead_idx[1]]
temp[i] <- ifelse(temp_value < 0, NA_real_, temp_value) # Set NA if negative
}
}
}
temp
},
# Set negative values to NA
work_time_weekly = ifelse(work_time_weekly < 0, NA_real_, work_time_weekly)
) %>%
ungroup() %>%
mutate(
# Remove remaining NA values in ALL rows
work_time_weekly = ifelse(is.na(work_time_weekly), NA_real_, work_time_weekly),
imputed_flag_work_time = ifelse(is.na(pgtatzeit) | pgtatzeit %in% c(-5, -8), 1, 0)
) %>%
filter(!is.na(work_time_weekly)) # Remove rows with NA work_time_weekly
# Summary Statistics for Net Income
# All rows
net_income_all <- main %>%
summarise(
Mean = mean(net_income, na.rm = TRUE),
Median = median(net_income, na.rm = TRUE),
Min = min(net_income, na.rm = TRUE),
Max = max(net_income, na.rm = TRUE),
Count = sum(!is.na(net_income))
)
# Imputed rows
net_income_imputed <- main %>%
filter(imputed_flag_net_income == 1) %>%
summarise(
Mean = mean(net_income, na.rm = TRUE),
Median = median(net_income, na.rm = TRUE),
Min = min(net_income, na.rm = TRUE),
Max = max(net_income, na.rm = TRUE),
Count = sum(!is.na(net_income))
)## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Min = min(net_income, na.rm = TRUE)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Non-imputed rows
net_income_non_imputed <- main %>%
filter(imputed_flag_net_income == 0) %>%
summarise(
Mean = mean(net_income, na.rm = TRUE),
Median = median(net_income, na.rm = TRUE),
Min = min(net_income, na.rm = TRUE),
Max = max(net_income, na.rm = TRUE),
Count = sum(!is.na(net_income))
)
# Summary Statistics for Work Time Weekly
# All rows
work_time_all <- main %>%
summarise(
Mean = mean(work_time_weekly, na.rm = TRUE),
Median = median(work_time_weekly, na.rm = TRUE),
Min = min(work_time_weekly, na.rm = TRUE),
Max = max(work_time_weekly, na.rm = TRUE),
Count = sum(!is.na(work_time_weekly))
)
# Imputed rows
work_time_imputed <- main %>%
filter(imputed_flag_work_time == 1) %>%
summarise(
Mean = mean(work_time_weekly, na.rm = TRUE),
Median = median(work_time_weekly, na.rm = TRUE),
Min = min(work_time_weekly, na.rm = TRUE),
Max = max(work_time_weekly, na.rm = TRUE),
Count = sum(!is.na(work_time_weekly))
)## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Min = min(work_time_weekly, na.rm = TRUE)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Non-imputed rows
work_time_non_imputed <- main %>%
filter(imputed_flag_work_time == 0) %>%
summarise(
Mean = mean(work_time_weekly, na.rm = TRUE),
Median = median(work_time_weekly, na.rm = TRUE),
Min = min(work_time_weekly, na.rm = TRUE),
Max = max(work_time_weekly, na.rm = TRUE),
Count = sum(!is.na(work_time_weekly))
)
# Display Results
print("Net Income Summary Statistics - All Rows:")## [1] "Net Income Summary Statistics - All Rows:"
## # A tibble: 1 × 5
## Mean Median Min Max Count
## <dbl> <dbl> <dbl> <dbl> <int>
## 1 1654. 1430 0 24000 4757
## [1] "Net Income Summary Statistics - Imputed Rows:"
## # A tibble: 1 × 5
## Mean Median Min Max Count
## <dbl> <dbl> <dbl> <dbl> <int>
## 1 NaN NA Inf -Inf 0
## [1] "Net Income Summary Statistics - Non-Imputed Rows:"
## # A tibble: 1 × 5
## Mean Median Min Max Count
## <dbl> <dbl> <dbl> <dbl> <int>
## 1 1654. 1430 0 24000 4757
## [1] "\nWork Time Weekly Summary Statistics - All Rows:"
## # A tibble: 1 × 5
## Mean Median Min Max Count
## <dbl> <dbl> <dbl> <dbl> <int>
## 1 36.6 40 1 80 4757
## [1] "Work Time Weekly Summary Statistics - Imputed Rows:"
## # A tibble: 1 × 5
## Mean Median Min Max Count
## <dbl> <dbl> <dbl> <dbl> <int>
## 1 NaN NA Inf -Inf 0
## [1] "Work Time Weekly Summary Statistics - Non-Imputed Rows:"
## # A tibble: 1 × 5
## Mean Median Min Max Count
## <dbl> <dbl> <dbl> <dbl> <int>
## 1 36.6 40 1 80 4757
# Plot for Net Income - All Rows
ggplot(main, aes(x = net_income)) +
geom_histogram(binwidth = 500, fill = "skyblue", color = "black", alpha = 0.7) +
labs(
title = "Net Income Distribution - All Rows",
x = "Net Income",
y = "Frequency"
) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))# Plot for Net Income - Imputed Rows
ggplot(main %>% filter(imputed_flag_net_income == 1), aes(x = net_income)) +
geom_histogram(binwidth = 500, fill = "orange", color = "black", alpha = 0.7) +
labs(
title = "Net Income Distribution - Imputed Rows",
x = "Net Income",
y = "Frequency"
) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))# Plot for Net Income - Non-Imputed Rows
ggplot(main %>% filter(imputed_flag_net_income == 0), aes(x = net_income)) +
geom_histogram(binwidth = 500, fill = "green", color = "black", alpha = 0.7) +
labs(
title = "Net Income Distribution - Non-Imputed Rows",
x = "Net Income",
y = "Frequency"
) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))# Plot for Work Time Weekly - All Rows
ggplot(main, aes(x = work_time_weekly)) +
geom_histogram(binwidth = 5, fill = "lightcoral", color = "black", alpha = 0.7) +
labs(
title = "Work Time Weekly Distribution - All Rows",
x = "Work Time (Hours/Week)",
y = "Frequency"
) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))# Plot for Work Time Weekly - Imputed Rows
ggplot(main %>% filter(imputed_flag_work_time == 1), aes(x = work_time_weekly)) +
geom_histogram(binwidth = 5, fill = "gold", color = "black", alpha = 0.7) +
labs(
title = "Work Time Weekly Distribution - Imputed Rows",
x = "Work Time (Hours/Week)",
y = "Frequency"
) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))# Plot for Work Time Weekly - Non-Imputed Rows
ggplot(main %>% filter(imputed_flag_work_time == 0), aes(x = work_time_weekly)) +
geom_histogram(binwidth = 5, fill = "blue", color = "black", alpha = 0.7) +
labs(
title = "Work Time Weekly Distribution - Non-Imputed Rows",
x = "Work Time (Hours/Week)",
y = "Frequency"
) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))# Transform total years unemployed (pgexpue)
# Step 1: Breakdown Before Imputation
missing_breakdown_before <- main %>%
summarise(
NA_count = sum(is.na(pgexpue)),
not_asked = sum(pgexpue %in% c(-8, -5), na.rm = TRUE),
other_neg = sum(pgexpue < 0 & !(pgexpue %in% c(-8, -5)), na.rm = TRUE)
)
print("Breakdown of missing values before imputation:")## [1] "Breakdown of missing values before imputation:"
## # A tibble: 1 × 3
## NA_count not_asked other_neg
## <int> <int> <int>
## 1 0 0 46
# Step 2: Impute Total Years Unemployment with Year Difference Logic and Max 2-Year Lag/Lead
main <- main %>%
arrange(pid, syear) %>%
group_by(pid) %>%
mutate(
total_years_unemployment = ifelse(pgexpue %in% c(-5, -8), NA_real_, pgexpue),
total_years_unemployment = {
temp <- total_years_unemployment
years <- syear
for (i in seq_len(n())) {
if (is.na(temp[i])) {
lag_idx <- which(!is.na(temp) & years < years[i] & (years[i] - years <= 2))
lead_idx <- which(!is.na(temp) & years > years[i] & (years - years[i] <= 2))
if (length(lag_idx) > 0 && length(lead_idx) > 0) {
lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
lead_diff <- years[lead_idx[1]] - years[i]
nearest <- ifelse(lag_diff <= lead_diff, lag_idx[length(lag_idx)], lead_idx[1])
temp[i] <- temp[nearest] - abs(years[i] - years[nearest]) # Adjust for year difference
} else if (length(lag_idx) > 0) {
lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
temp[i] <- temp[lag_idx[length(lag_idx)]] - lag_diff
} else if (length(lead_idx) > 0) {
lead_diff <- years[lead_idx[1]] - years[i]
temp[i] <- temp[lead_idx[1]] - lead_diff
}
}
}
temp
},
total_years_unemployment = ifelse(is.na(total_years_unemployment) & pgexpue == -2, 0, total_years_unemployment)
) %>%
ungroup() %>%
mutate(
total_years_unemployment = ifelse(total_years_unemployment < 0, NA_real_, total_years_unemployment), # Remove negatives
imputed_flag = if_else(is.na(pgexpue) | pgexpue %in% c(-8, -5), 1, 0) # Track imputed rows
)
# Step 3: Breakdown After Imputation
missing_breakdown_after <- main %>%
summarise(
NA_count_after = sum(is.na(total_years_unemployment)),
imputed_count = sum(imputed_flag)
)
print("Breakdown of missing values after imputation:")## [1] "Breakdown of missing values after imputation:"
## # A tibble: 1 × 2
## NA_count_after imputed_count
## <int> <dbl>
## 1 46 0
# Step 4: Descriptive Statistics for Imputed and Non-Imputed Rows
# Imputed Rows
imputed_stats <- main %>%
filter(imputed_flag == 1) %>%
summarise(
Mean_Years_Unemployed = mean(total_years_unemployment, na.rm = TRUE),
Median_Years_Unemployed = median(total_years_unemployment, na.rm = TRUE),
Q1 = quantile(total_years_unemployment, 0.25, na.rm = TRUE),
Q3 = quantile(total_years_unemployment, 0.75, na.rm = TRUE),
Min = min(total_years_unemployment, na.rm = TRUE),
Max = max(total_years_unemployment, na.rm = TRUE),
Count = n()
)## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Min = min(total_years_unemployment, na.rm = TRUE)`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Non-Imputed Rows
non_imputed_stats <- main %>%
filter(imputed_flag == 0) %>%
summarise(
Mean_Years_Unemployed = mean(total_years_unemployment, na.rm = TRUE),
Median_Years_Unemployed = median(total_years_unemployment, na.rm = TRUE),
Q1 = quantile(total_years_unemployment, 0.25, na.rm = TRUE),
Q3 = quantile(total_years_unemployment, 0.75, na.rm = TRUE),
Min = min(total_years_unemployment, na.rm = TRUE),
Max = max(total_years_unemployment, na.rm = TRUE),
Count = n()
)
print("Descriptive Statistics for Imputed Rows:")## [1] "Descriptive Statistics for Imputed Rows:"
## # A tibble: 1 × 7
## Mean_Years_Unemployed Median_Years_Unemployed Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 NaN NA NA NA Inf -Inf 0
## [1] "Descriptive Statistics for Non-Imputed Rows:"
## # A tibble: 1 × 7
## Mean_Years_Unemployed Median_Years_Unemployed Q1 Q3 Min Max Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 0.707 0 0 0.5 0 27.2 4757
# Step 5: Visualizations
# All Rows
ggplot(main, aes(x = total_years_unemployment)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(
title = "Distribution of Total Years Unemployed - All Rows",
x = "Total Years Unemployed",
y = "Count"
) +
theme_minimal()## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 1), aes(x = total_years_unemployment)) +
geom_histogram(binwidth = 1, fill = "orange", color = "black") +
labs(
title = "Distribution of Total Years Unemployed - Imputed Rows",
x = "Total Years Unemployed",
y = "Count"
) +
theme_minimal()# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 0), aes(x = total_years_unemployment)) +
geom_histogram(binwidth = 1, fill = "green", color = "black") +
labs(
title = "Distribution of Total Years Unemployed - Non-Imputed Rows",
x = "Total Years Unemployed",
y = "Count"
) +
theme_minimal()## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Transform occupational status (pgstib)
main <- main %>%
mutate(
occup_status = case_when(
between(pgstib, 210, 250) ~ 1, # Blue collars
between(pgstib, 510, 550) ~ 2, # White collars
between(pgstib, 610, 640) | pgstib == 15 ~ 3, # Public servant
between(pgstib, 410, 440) ~ 4, # Self-employed
pgstib == 12 ~ 5, # Unemployed
pgstib == 10 ~ 6, # Non-working
between(pgstib, 110, 150) | pgstib %in% c(11, 13) ~ 7, # Apprentice, retired
TRUE ~ 8 # Assign a default category for unmatched values
)
) %>%
# Drop the original `pgstib` column
select(-pgstib)
# Add labels to `occup_status`, including the new "other" category
occup_status_labels <- c(
"1" = "blue collar",
"2" = "white collar",
"3" = "public servant",
"4" = "self-employed",
"5" = "unemployed",
"6" = "non-working",
"7" = "apprentice, retired",
"8" = "other"
)
main$occup_status <- factor(main$occup_status, levels = 1:8, labels = occup_status_labels)
#Visualisation: Bar Plot
ggplot(main, aes(x = occup_status)) +
geom_bar(fill = "darkred", color = "black") +
labs(
title = "Distribution of Occupational Status",
x = "Occupational Status",
y = "Count"
) +
theme_minimal()# Transform sex into male dummy
main <- main %>%
mutate(
male = case_when(
sex == 1 ~ 1,
sex == 2 ~ 0,
TRUE ~ NA_real_
)
) %>%
group_by(pid) %>%
mutate(male = if_else(is.na(male), first(na.omit(male)), male)) %>%
ungroup() %>%
drop_na(male) %>%
select(-sex)
#Visualisation: Pie Chart
main %>%
group_by(male) %>%
summarize(count = n()) %>%
mutate(
proportion = count / sum(count),
label = paste0(round(proportion * 100, 1), "%")
) %>%
ggplot(aes(x = "", y = proportion, fill = factor(male, labels = c("Female", "Male")))) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
labs(
title = "Proportion of Male vs. Female",
fill = "Gender"
) +
theme_minimal() +
geom_text(aes(label = label), position = position_stack(vjust = 0.5))# Transform migration background and create dummy
main <- main %>%
mutate(
migback = case_when(
migback == -2 ~ 1,
migback < 0 ~ NA_real_,
TRUE ~ migback
)
) %>%
group_by(pid) %>%
mutate(migback = if_else(is.na(migback), first(na.omit(migback)), migback)) %>%
ungroup() %>%
drop_na(migback) %>%
mutate(
migback = factor(
migback,
levels = c(1, 2, 3),
labels = c("no migration", "direct migration", "indirect migration")
),
migback_dummy = ifelse(migback %in% c("direct migration", "indirect migration"), 1, 0)
)
#Visualisation: Bar plot
ggplot(main, aes(x = migback)) +
geom_bar(fill = "cyan", color = "black") +
labs(
title = "Distribution of Migration Background",
x = "Migration Background",
y = "Count"
) +
theme_minimal()## Warning: Unknown or uninitialised column: `pgfamstd_dummy`.
## [1] "Number of NA rows in pgfamstd_dummy before imputation: 0"
# Transform marital status and handle NA with lag/lead logic
main <- main %>%
arrange(pid, syear) %>% # Ensure data is ordered by pid and syear
group_by(pid) %>%
mutate(
pgfamstd = case_when(
pgfamstd < 0 ~ NA_real_, # Replace negative values with NA
TRUE ~ pgfamstd # Keep the original value otherwise
),
pgfamstd_dummy = case_when(
pgfamstd %in% c(1, 6, 7, 8) ~ "married",
pgfamstd %in% c(2, 3, 4, 5) ~ "single",
TRUE ~ NA_character_ # Handle unexpected missings
),
# Impute NA values using lag and lead logic
pgfamstd_dummy = {
temp <- pgfamstd_dummy
years <- syear
for (i in seq_len(n())) {
if (is.na(temp[i])) {
lag_idx <- which(!is.na(temp) & years < years[i] & (years[i] - years <= 2))
lead_idx <- which(!is.na(temp) & years > years[i] & (years - years[i] <= 2))
if (length(lag_idx) > 0 && length(lead_idx) > 0) {
lag_diff <- years[i] - years[lag_idx[length(lag_idx)]]
lead_diff <- years[lead_idx[1]] - years[i]
nearest <- ifelse(lag_diff <= lead_diff, lag_idx[length(lag_idx)], lead_idx[1])
temp[i] <- temp[nearest]
} else if (length(lag_idx) > 0) {
temp[i] <- temp[lag_idx[length(lag_idx)]]
} else if (length(lead_idx) > 0) {
temp[i] <- temp[lead_idx[1]]
}
}
}
temp
}
) %>%
ungroup() %>%
mutate(
imputed_flag = if_else(is.na(pgfamstd) | pgfamstd < 0, 1, 0) # Track imputed rows
)
# Drop all rows with NA in `pgfamstd_dummy`
main <- main %>%
filter(!is.na(pgfamstd_dummy))
# Summary Statistics
# All Rows
all_stats <- main %>%
summarise(
Married = sum(pgfamstd_dummy == "married", na.rm = TRUE),
Single = sum(pgfamstd_dummy == "single", na.rm = TRUE),
Total = n()
)
# Imputed Rows
imputed_stats <- main %>%
filter(imputed_flag == 1) %>%
summarise(
Married = sum(pgfamstd_dummy == "married", na.rm = TRUE),
Single = sum(pgfamstd_dummy == "single", na.rm = TRUE),
Total = n()
)
# Non-Imputed Rows
non_imputed_stats <- main %>%
filter(imputed_flag == 0) %>%
summarise(
Married = sum(pgfamstd_dummy == "married", na.rm = TRUE),
Single = sum(pgfamstd_dummy == "single", na.rm = TRUE),
Total = n()
)
print("Summary Statistics for All Rows:")## [1] "Summary Statistics for All Rows:"
## # A tibble: 1 × 3
## Married Single Total
## <int> <int> <int>
## 1 2903 1829 4732
## [1] "Summary Statistics for Imputed Rows:"
## # A tibble: 1 × 3
## Married Single Total
## <int> <int> <int>
## 1 1 1 2
## [1] "Summary Statistics for Non-Imputed Rows:"
## # A tibble: 1 × 3
## Married Single Total
## <int> <int> <int>
## 1 2902 1828 4730
# Visualizations
# All Rows
ggplot(main, aes(x = factor(pgfamstd_dummy))) +
geom_bar(fill = "skyblue", color = "black") +
labs(
title = "Distribution of Family Status - All Rows",
x = "Family Status",
y = "Count"
) +
theme_minimal()# Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 1), aes(x = factor(pgfamstd_dummy))) +
geom_bar(fill = "orange", color = "black") +
labs(
title = "Distribution of Family Status - Imputed Rows",
x = "Family Status",
y = "Count"
) +
theme_minimal()# Non-Imputed Rows Only
ggplot(main %>% filter(imputed_flag == 0), aes(x = factor(pgfamstd_dummy))) +
geom_bar(fill = "green", color = "black") +
labs(
title = "Distribution of Family Status - Non-Imputed Rows",
x = "Family Status",
y = "Count"
) +
theme_minimal()main <- main %>%
mutate(
germborn = case_when(
germborn == 2 ~ 0,
germborn < 0 ~ NA_real_,
TRUE ~ germborn
)
) %>%
group_by(pid) %>%
mutate(germborn = if_else(is.na(germborn), first(na.omit(germborn)), germborn)) %>%
ungroup() %>%
drop_na(germborn)
#Visualisation: Bar Plot
ggplot(main, aes(x = factor(germborn, labels = c("Not Born in Germany", "Born in Germany")))) +
geom_bar(fill = "pink", color = "black") +
labs(
title = "Proportion of Individuals Born in Germany",
x = "Born in Germany",
y = "Count"
) +
theme_minimal()# Create age and filter age range
main <- main %>%
arrange(pid, syear) %>% # Ensure data is sorted by pid and syear
group_by(pid) %>%
mutate(
# Impute `ple0010_h` using the closest valid value in the same group
ple0010_h = if_else(
is.na(ple0010_h),
ple0010_h[!is.na(ple0010_h)] + (syear - syear[!is.na(ple0010_h)]), # Calculate from closest year
ple0010_h
)
) %>%
ungroup() %>%
mutate(
age = syear - ple0010_h # Calculate age based on imputed or existing `ple0010_h`
) %>%
filter(!is.na(ple0010_h) & ple0010_h > 0) %>% # Filter out invalid entries
filter(age >= 25 & age <= 55) %>% # Keep only rows within the age range
select(-ple0010_h) # Drop `ple0010_h` after processing
# Visualisation: Histogram
ggplot(main, aes(x = age)) +
geom_histogram(binwidth = 5, fill = "gold", color = "black") +
labs(
title = "Distribution of Age (25-55 Years)",
x = "Age",
y = "Count"
) +
theme_minimal()## [1] "pid" "syear"
## [3] "health" "worr_health_categorical"
## [5] "ple0055" "ple0056"
## [7] "ple0072" "ple0081_h"
## [9] "ple0086_v4" "ple0080_v2"
## [11] "ple0086_v1" "ple0011"
## [13] "ple0012" "ple0013"
## [15] "ple0014" "ple0015"
## [17] "ple0016" "ple0017"
## [19] "ple0018" "ple0019"
## [21] "ple0020" "ple0021"
## [23] "ple0022" "ple0023"
## [25] "ple0004" "ple0005"
## [27] "plh0182" "plh0171"
## [29] "plh0042" "plh0033"
## [31] "plh0032" "ple0006"
## [33] "ple0007" "pgerwzeit"
## [35] "pgemplst" "pglabnet"
## [37] "pgtatzeit" "pgexpue"
## [39] "pgfamstd" "hid"
## [41] "psample" "gebjahr"
## [43] "migback" "germborn"
## [45] "worr_health_dummy" "past_health"
## [47] "health_diff" "health_in_2yrs"
## [49] "health_decline_2yrs" "gs01"
## [51] "gripr1" "gripr2"
## [53] "gripl1" "gripl2"
## [55] "avg_gripr" "avg_gripl"
## [57] "avg_grip" "ple0055_flag"
## [59] "ple0056_flag" "imputed_flag"
## [61] "hospital_stay" "ple0004_imputed"
## [63] "ple0005_imputed" "impaired_stairs"
## [65] "impaired_tiring_tasks" "ple0011_imputed"
## [67] "ple0012_imputed" "ple0013_imputed"
## [69] "ple0014_imputed" "ple0015_imputed"
## [71] "ple0016_imputed" "ple0017_imputed"
## [73] "ple0018_imputed" "ple0019_imputed"
## [75] "ple0020_imputed" "ple0021_imputed"
## [77] "ple0022_imputed" "ple0023_imputed"
## [79] "illnesses" "imputed_flag_life"
## [81] "imputed_flag_health" "life_satisfaction"
## [83] "health_satisfaction" "height_imputed"
## [85] "height" "weight_imputed"
## [87] "weight" "BMI"
## [89] "bmi_categorical" "smoking_imputed"
## [91] "smoking_dummy" "cigarettes_imputed"
## [93] "nb_of_cigarettes" "diet_categorical"
## [95] "diet_dummy" "worries_job_imputed"
## [97] "worries_financial_imputed" "worries_economic_imputed"
## [99] "worr_job_categorical" "worr_financial_categorical"
## [101] "worr_economic_categorical" "worr_job_dummy"
## [103] "worr_financial_dummy" "worr_economic_dummy"
## [105] "tenure" "net_income"
## [107] "imputed_flag_net_income" "work_time_weekly"
## [109] "imputed_flag_work_time" "total_years_unemployment"
## [111] "occup_status" "male"
## [113] "migback_dummy" "pgfamstd_dummy"
## [115] "age"
# Select only the relevant variables
main <- main %>%
select(
pid, syear, # Unique identifiers
health, worr_health_categorical, worr_health_dummy,
migback, germborn, hospital_stay, smoking_dummy, nb_of_cigarettes, illnesses,
impaired_stairs, impaired_tiring_tasks, diet_categorical, diet_dummy, avg_grip,
life_satisfaction, health_satisfaction,
height, weight, BMI, bmi_categorical, worr_job_categorical, worr_financial_categorical,
worr_economic_categorical, worr_job_dummy, worr_financial_dummy, worr_economic_dummy,
tenure, net_income, work_time_weekly, total_years_unemployment,
occup_status, male, migback_dummy, age
)
#Display the summary of the dataset
skim(main)| Name | main |
| Number of rows | 3336 |
| Number of columns | 36 |
| _______________________ | |
| Column type frequency: | |
| factor | 8 |
| numeric | 28 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| worr_health_categorical | 0 | 1.00 | FALSE | 3 | wor: 1815, wor: 833, no : 688 |
| migback | 0 | 1.00 | FALSE | 3 | no : 2673, dir: 498, ind: 165 |
| diet_categorical | 1437 | 0.57 | FALSE | 4 | som: 875, som: 759, ver: 160, ver: 105 |
| bmi_categorical | 32 | 0.99 | FALSE | 4 | Nor: 1414, Ove: 1151, Obe: 684, Und: 55 |
| worr_job_categorical | 0 | 1.00 | FALSE | 3 | no : 1684, wor: 1181, wor: 471 |
| worr_financial_categorical | 0 | 1.00 | FALSE | 3 | wor: 1784, no : 838, wor: 714 |
| worr_economic_categorical | 0 | 1.00 | FALSE | 3 | wor: 1999, wor: 843, no : 494 |
| occup_status | 0 | 1.00 | FALSE | 6 | whi: 1974, blu: 833, sel: 245, pub: 242 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| pid | 0 | 1.00 | 10398312.66 | 11303900.38 | 5203.00 | 2303352.00 | 5028151.00 | 20377276.00 | 35031602.00 | ▇▁▁▁▂ |
| syear | 0 | 1.00 | 2010.65 | 5.48 | 2000.00 | 2006.00 | 2012.00 | 2016.00 | 2018.00 | ▃▆▅▆▇ |
| health | 0 | 1.00 | 3.17 | 0.96 | 1.00 | 3.00 | 3.00 | 4.00 | 5.00 | ▁▅▇▇▁ |
| worr_health_dummy | 0 | 1.00 | 0.79 | 0.40 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▂▁▁▁▇ |
| germborn | 0 | 1.00 | 0.85 | 0.36 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▂▁▁▁▇ |
| hospital_stay | 0 | 1.00 | 9.50 | 12.80 | 0.00 | 4.00 | 6.00 | 11.00 | 210.00 | ▇▁▁▁▁ |
| smoking_dummy | 1 | 1.00 | 0.35 | 0.48 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
| nb_of_cigarettes | 0 | 1.00 | 0.12 | 1.31 | 0.00 | 0.00 | 0.00 | 0.00 | 25.00 | ▇▁▁▁▁ |
| illnesses | 0 | 1.00 | 0.00 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 4.00 | ▇▁▁▁▁ |
| impaired_stairs | 0 | 1.00 | 0.40 | 0.49 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▆ |
| impaired_tiring_tasks | 0 | 1.00 | 0.52 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
| diet_dummy | 1437 | 0.57 | 0.48 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
| avg_grip | 2719 | 0.18 | 389.81 | 117.90 | 45.00 | 301.00 | 365.00 | 475.00 | 775.00 | ▁▇▇▅▁ |
| life_satisfaction | 12 | 1.00 | 6.91 | 1.83 | -1.00 | 6.00 | 7.00 | 8.00 | 10.00 | ▁▁▂▆▇ |
| health_satisfaction | 12 | 1.00 | 6.04 | 2.29 | -1.00 | 5.00 | 6.00 | 8.00 | 10.00 | ▁▃▆▇▇ |
| height | 6 | 1.00 | 172.58 | 9.52 | 105.00 | 165.00 | 172.00 | 180.00 | 208.00 | ▁▁▃▇▁ |
| weight | 32 | 0.99 | 79.34 | 18.18 | 35.00 | 65.00 | 78.00 | 90.00 | 215.00 | ▅▇▁▁▁ |
| BMI | 32 | 0.99 | 26.53 | 5.35 | 11.05 | 22.99 | 25.66 | 29.24 | 95.24 | ▇▃▁▁▁ |
| worr_job_dummy | 0 | 1.00 | 0.50 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
| worr_financial_dummy | 0 | 1.00 | 0.75 | 0.43 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▃▁▁▁▇ |
| worr_economic_dummy | 0 | 1.00 | 0.85 | 0.36 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▂▁▁▁▇ |
| tenure | 1 | 1.00 | 10.09 | 8.98 | 0.00 | 2.50 | 7.33 | 15.71 | 40.17 | ▇▃▂▁▁ |
| net_income | 0 | 1.00 | 1672.95 | 1282.51 | 0.00 | 892.75 | 1500.00 | 2100.00 | 24000.00 | ▇▁▁▁▁ |
| work_time_weekly | 0 | 1.00 | 36.73 | 13.31 | 1.00 | 30.00 | 40.00 | 45.00 | 80.00 | ▂▃▇▂▁ |
| total_years_unemployment | 36 | 0.99 | 0.75 | 2.01 | 0.00 | 0.00 | 0.00 | 0.50 | 23.08 | ▇▁▁▁▁ |
| male | 0 | 1.00 | 0.45 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▆ |
| migback_dummy | 0 | 1.00 | 0.20 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| age | 0 | 1.00 | 42.46 | 8.40 | 25.00 | 36.00 | 44.00 | 50.00 | 55.00 | ▅▅▆▇▇ |